library(readr)
library(stringr)
library(dplyr)
library(here)
library(ggplot2)
library(mogo)
library(clusterProfiler)
library(viridis)
Let’s set the paths to the directory containing the raw data and to the directory to which we will save our results.
raw_data_path = paste(here(), "data", "raw", sep = "/")
result_path = paste(here(), "results", sep = "/")
graphics_path = paste(here(), "graphics", sep = "/")
Read data from csv file and select the transcript identifiers
(e.g. MGG_07200T0).
cluster1 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 1 _on_guy11_and_dpmk1.csv') %>%
dplyr::select(id)
Convert transcript identifier to gene identifier (e.g. from
MGG_07200T0 to MGG_07200) and collapse
duplicate gene identifiers: for example, transcript
MGG_07200T0 (gene MGG_07200) appears twice in
cluster 1 because it correspond to a protein with two phosphorylation
sites
([1-38]-1xMet-loss+Acetyl [N-Term];1xPhospho [S20(91.8)]
and
[1-38]-1xMet-loss+Acetyl [N-Term];1xPhospho [T16(96.9)]);
in this case, we only keep the one instance of the transcript/gene
identifier to avoid over-estimating the representation of the GO term(s)
associated with this transcript/gene identifier.
clust1 = cluster1 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
dplyr::select(gene_ids) %>%
unique() # collapse duplicates of the same gene identifier
Perform the GO enrichment analysis
enrich1 <- do_enrich(clust1$gene_ids)
barplot(enrich1) +
scale_fill_viridis(direction = -1) +
ggplot2::theme(aspect.ratio = 1)
dotplot(enrich1) +
scale_colour_viridis(direction = -1)
enrich1_df = as.data.frame(enrich1) %>%
select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)
csv_filename = paste(result_path, "011_cluster1_go_enrichment.csv", sep = "/")
write_csv(enrich1_df, csv_filename)
Read data from csv file.
cluster2 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 2 _on_guy11_and_dpmk1.csv') %>%
dplyr::select(id)
Keep only the gene id
clust2 = cluster2 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
dplyr::select(gene_ids) %>%
unique()
enrich2 <- do_enrich(clust2$gene_ids)
barplot(enrich2) +
scale_fill_viridis(direction = -1) +
ggplot2::theme(aspect.ratio = 1)
dotplot(enrich2) +
scale_colour_viridis(direction = -1)
enrich2_df = as.data.frame(enrich2) %>%
select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)
csv_filename = paste(result_path, "011_cluster2_go_enrichment.csv", sep = "/")
write_csv(enrich2_df, csv_filename)
Read data from csv file.
cluster3 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 3 _on_guy11_and_dpmk1.csv') %>% dplyr::select(id)
Keep only the gene id
clust3 = cluster3 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
dplyr::select(gene_ids) %>% unique()
enrich3 <- do_enrich(clust3$gene_ids)
barplot(enrich3) + scale_fill_viridis(direction = -1) + ggplot2::theme(aspect.ratio = 1)
dotplot(enrich3) + scale_colour_viridis(direction = -1)
enrich3_df = as.data.frame(enrich3) %>%
select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)
csv_filename = paste(result_path, "011_cluster3_go_enrichment.csv", sep = "/")
write_csv(enrich3_df, csv_filename)
Read data from csv file.
cluster4 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 4 _on_guy11_and_dpmk1.csv') %>%
dplyr::select(id)
Keep only the gene id
clust4 = cluster4 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
dplyr::select(gene_ids) %>%
unique()
enrich4 <- do_enrich(clust4$gene_ids)
barplot(enrich4) +
scale_fill_viridis(direction = -1) +
ggplot2::theme(aspect.ratio = 1)
dotplot(enrich4) +
scale_colour_viridis(direction = -1)
enrich4_df = as.data.frame(enrich4) %>%
select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)
csv_filename = paste(result_path, "011_cluster4_go_enrichment.csv", sep = "/")
write_csv(enrich3_df, csv_filename)
Read data from csv file.
cluster5 = read_csv('/Users/jegoussc/GitHub/PD_PRO_1499_06122021_PRM2/results/010_kmeans_cluster_ 5 _on_guy11_and_dpmk1.csv') %>%
dplyr::select(id)
Keep only the gene id
clust5 = cluster5 %>% mutate(gene_ids = str_split(id, " ", n = 2, simplify = TRUE)[,1]) %>%
mutate(gene_ids = str_remove(gene_ids, "T0")) %>%
dplyr::select(gene_ids) %>%
unique()
enrich5 <- do_enrich(clust5$gene_ids)
barplot(enrich5) +
scale_fill_viridis(direction = -1) +
ggplot2::theme(aspect.ratio = 1)
dotplot(enrich5) +
scale_colour_viridis(direction = -1)
enrich5_df = as.data.frame(enrich5) %>%
select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)
csv_filename = paste(result_path, "011_cluster5_go_enrichment.csv", sep = "/")
write_csv(enrich5_df, csv_filename)
Read data from csv file.
data_filename = paste(raw_data_path, "Pmk1 direct targets PRM_101122.xlsx", sep = "/")
targets = readxl::read_xlsx(data_filename) %>%
dplyr::select(`Phosphorylation position`) %>% unique()
Keep only the gene id (for now I remove gi number) Replacement:
gi|145610929|ref|XP_368444.2| MST7 = MGG_00800 gi|39964959|ref|XP_365053.1| MAC1 (S75) = MGG_09898
targets2 = targets %>%
dplyr::mutate(gene_ids =
stringr::str_split(`Phosphorylation position`, " ", n = 2, simplify = TRUE)[,1]) %>%
dplyr::mutate(gene_ids = ifelse(`gene_ids` == 'gi|145610929|ref|XP_368444.2|',
'MGG_00800', gene_ids)) %>%
dplyr::mutate(gene_ids = ifelse(`gene_ids` == 'gi|39964959|ref|XP_365053.1|',
'MGG_09898', gene_ids))
In several cases, few selected peptides correspond to the same protein and thus the different lines/rows (peptides) are collapsed so that a single MGG number (gene/protein) is taken into consideration in the GO enrichment analysis resulting in a list of 31 proteins.
gene_ids = targets2 %>%
dplyr::mutate(gene_ids = stringr::str_split(`Phosphorylation position`, " ", n = 2, simplify = TRUE)[,1]) %>%
dplyr::filter(str_detect(gene_ids, "^MGG")) %>%
dplyr::select(gene_ids) %>%
unique()
enrich_targets <- do_enrich(gene_ids$gene_ids)
barplot(enrich_targets) +
#scale_fill_viridis(direction = -1) +
ggplot2::scale_fill_distiller(type = "seq", direction = -1, palette = 'Reds') +
ggplot2::theme(aspect.ratio = 1)
# PRM-GO-dotplot.pdf
dotplot(enrich_targets) +
ggplot2::scale_colour_distiller(type = "seq", direction = -1, palette = 'Reds', limits = c(0, 0.005))
# limits and directions are set so that the more intense the colour is, more smaller/better the p-value is
enrich_targets_df = as.data.frame(enrich_targets) %>%
select(ID, geneID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue)
csv_filename = paste(result_path, "011_targets_go_enrichment.csv", sep = "/")
write_csv(enrich_targets_df, csv_filename)
February 2023: New request from Neftaly.
Let’s summarise all the go enrichement analysis in a single dot plot.
First, for all the dataframe build previously, we add a column with the cluster number (as categorial variable, not numerical).
enrich1_df$cluster = "1"
enrich2_df$cluster = "2"
enrich3_df$cluster = "3"
enrich4_df$cluster = "4"
enrich5_df$cluster = "5"
Now we can bind all these data frames together.
enrichment_summary_df = rbind(enrich1_df, enrich2_df,
enrich3_df, enrich4_df, enrich5_df)
We transform the current gene ratio (50/100) to its numerical value (0.5).
enrichment_summary_df = enrichment_summary_df |> mutate(
gene.ratio = sapply(strsplit(enrichment_summary_df$GeneRatio, "/"),
function(x) as.numeric(x[1])/as.numeric(x[2])))
We draw th plot
ggplot(enrichment_summary_df,
aes(x = cluster, y = Description, size = gene.ratio, colour = p.adjust)) +
geom_point()
plot_filename = paste(graphics_path, "011-go-enrichment-summary.pdf", sep = "/")
ggsave(file = plot_filename,
plot = last_plot(), width = 10, height = 7)
Sort out these GO terms!
library(GO.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:clusterProfiler':
##
## rename
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:clusterProfiler':
##
## slice
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
library(viridis)
ontology = unlist(Ontology(GOTERM))
enrichment_summary_df$ont = ontology[enrichment_summary_df$ID]
enrichment_summary_df = enrichment_summary_df |> mutate(
labels = paste0(Description, " (", ID, ")")
)
ggplot(enrichment_summary_df,
aes(x = cluster, y = labels, size = gene.ratio, colour = p.adjust)) +
#scale_color_viridis(direction = -1) +
geom_point() + facet_grid(ont~., scales = "free", space = "free")
plot_filename = paste(graphics_path, "011-go-enrichment-summary-v1.pdf", sep = "/")
ggsave(file = plot_filename,
plot = last_plot(), width = 10, height = 7)
What are these GO terms that are displayed as “NA”.
After discussion with Neftaly, we will remove the obsolete term and move the ATP-dependant microtubule to MF:
cleaned_enrichment_summary_df = enrichment_summary_df %>%
filter(labels != "obsolete signal transducer activity (GO:0004871)") %>%
mutate(ont = ifelse(labels == "ATP-dependent microtubule motor activity (GO:1990939)", "MF", ont))
ggplot(cleaned_enrichment_summary_df,
aes(x = cluster, y = labels, size = gene.ratio, colour = p.adjust)) +
scale_color_viridis(direction = -1) +
geom_point() +
facet_grid(ont~., scales = "free", space = "free")
plot_filename = paste(graphics_path, "011-go-enrichment-summary-v2.pdf", sep = "/")
ggsave(file = plot_filename,
plot = last_plot(), width = 7, height = 10)
ggplot(enrichment_summary_df,
aes(
#x = cluster,
x = ordered(cluster, levels = c("5", "4", "3", "2", "1")),
y = labels, size = gene.ratio, colour = p.adjust)) +
scale_color_viridis(option = "B", direction = -1) +
geom_point() + facet_grid(~ont, scales = "free", space = "free") +
coord_flip() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + labs(x = "clusters")
plot_filename = paste(graphics_path, "011-go-enrichment-summary-v3.pdf", sep = "/")
ggsave(file = plot_filename,
plot = last_plot(), width = 10, height = 7)